Load Libraries

# Data manipulation and visualization 
library(tidyverse)
library(here)
library(janitor)
library(patchwork)
#library(grid)
#library(gridExtra)
#library(ghibli)
#library(RColorBrewer)
#library(gt)
#library(gtExtras)

# Data analysis/statistics 
library(car) #ANOVA
library(ggeffects)
library(effects) #dependency for ggeffects
library(ggResidpanel) #residual panel
library(GGally)
library(arm) #discrete.histogram() function
library(MASS) #required for arm package

Load the functions

dispersion <- function (model){
  sum(residuals(model, type = "pearson")^2)/(length(model$y)-length(model$coefficients))
}

Load data

The variables with are interested in are: - turf algae (column ‘ta’) - hard coral (column ‘hard_coral’) - sand (column ‘sand’) - crustose coralline algae (column ‘cca’)

fish <- read_csv(here("HW5","data","CRCP_Reef_Fish_Surveys_kole.csv"))

Question 1: Predictor Correlations

fish1 <- fish[,c("ta","hard_coral", "sand", "cca")]
ggpairs(fish1)

Question 2: Poisson distribution with single predictor model

For each of the four predictors, make single-predictor models where the response variable is kole counts. Consider which of the predictors could be transformed to reduce skew along the x-axis. For educational purposes, start by using a poisson distribution for the counts.

Turf Algae

#Based on the frequency distribution of the turf algae (ta) data, the predictor does not seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$ta)

ta_lm <- lm(count ~ ta, data = fish)
summary(ta_lm)
## 
## Call:
## lm(formula = count ~ ta, data = fish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.110 -11.896  -6.206   4.173 111.395 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.11012    3.22407   9.029  < 2e-16 ***
## ta          -0.26263    0.05234  -5.018 8.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.31 on 308 degrees of freedom
## Multiple R-squared:  0.07557,    Adjusted R-squared:  0.07257 
## F-statistic: 25.18 on 1 and 308 DF,  p-value: 8.854e-07
resid_panel(ta_lm, plots = c("resid", "qq", "lev", "hist"))

ta_glm <- glm(count ~ ta, family= poisson, data = fish)
summary(ta_glm)
## 
## Call:
## glm(formula = count ~ ta, family = poisson, data = fish)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.5784202  0.0352931   101.4   <2e-16 ***
## ta          -0.0177069  0.0006581   -26.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6862.4  on 308  degrees of freedom
## AIC: 7783.4
## 
## Number of Fisher Scoring iterations: 6
resid_panel(ta_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(ta_glm)

# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(ta_glm) %>% plot(log_y = TRUE)
ggeff

Anova(ta_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##    LR Chisq Df Pr(>Chisq)    
## ta   710.68  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$ta, prob.col = 'orange', xlim = c(0, 100), ylim = c(0,0.18))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.15, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

Overdispersion: Poisson and Binomial distributions makes assumptions on the variability of the data which is used for the calculations of likelihood. Larger variability will result in smaller Confidence Intervals (CI) and p-values, which is inaccurate.
With a dispersion of 30.10 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

dispersion(ta_glm)
## [1] 30.10215

Hard Coral

#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$hard_coral)

coral_lm <- lm(count ~ hard_coral, data = fish)
summary(coral_lm)
## 
## Call:
## lm(formula = count ~ hard_coral, data = fish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.440  -9.619  -6.177   3.039 111.774 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.31592    1.69524   3.136  0.00188 ** 
## hard_coral   0.43034    0.06312   6.818 4.88e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.69 on 308 degrees of freedom
## Multiple R-squared:  0.1311, Adjusted R-squared:  0.1283 
## F-statistic: 46.49 on 1 and 308 DF,  p-value: 4.876e-11
resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))

coral_glm <- glm(count ~ hard_coral, family= poisson, data = fish)
summary(coral_glm)
## 
## Call:
## glm(formula = count ~ hard_coral, family = poisson, data = fish)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 2.0379324  0.0257021   79.29   <2e-16 ***
## hard_coral  0.0243714  0.0006953   35.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6481.7  on 308  degrees of freedom
## AIC: 7402.7
## 
## Number of Fisher Scoring iterations: 6
resid_panel(coral_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(coral_glm)

ggeff <- ggeffect(coral_glm) %>% plot(log_y = TRUE)
ggeff

Anova(coral_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##            LR Chisq Df Pr(>Chisq)    
## hard_coral   1091.4  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$hard_coral, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

dispersion(coral_glm)
## [1] 28.13073

With a dispersion of 28.13 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

Sand

#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$sand)

sand_glm <- glm(count ~ sand, family= poisson, data = fish)
summary(sand_glm)
## 
## Call:
## glm(formula = count ~ sand, family = poisson, data = fish)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.946727   0.018964  155.39   <2e-16 ***
## sand        -0.035770   0.001685  -21.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6998.7  on 308  degrees of freedom
## AIC: 7919.6
## 
## Number of Fisher Scoring iterations: 6
resid_panel(sand_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(sand_glm)

# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(sand_glm) %>% plot(log_y = TRUE)
ggeff

Anova(sand_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##      LR Chisq Df Pr(>Chisq)    
## sand   574.42  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$sand, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

dispersion(sand_glm)
## [1] 30.97697

With a dispersion of 30.98 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

CCA

#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$cca)

cca_glm <- glm(count ~ cca, family= poisson, data = fish)
summary(cca_glm)
## 
## Call:
## glm(formula = count ~ cca, family = poisson, data = fish)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 2.270516   0.021449  105.85   <2e-16 ***
## cca         0.034430   0.001153   29.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6846.7  on 308  degrees of freedom
## AIC: 7767.6
## 
## Number of Fisher Scoring iterations: 6
resid_panel(cca_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(cca_glm)

ggeff <- ggeffect(cca_glm) %>% plot(log_y = TRUE)
ggeff

Anova(cca_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##     LR Chisq Df Pr(>Chisq)    
## cca   726.46  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$cca, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

dispersion(cca_glm)
## [1] 30.8293

With a dispersion of 30.83 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

Answer: Relationship between Kole and the predictors

  • There is a positive relationship between Kole and hard corals, and CCA. An increase in those predictors will result in an significant increase in Kole count.
  • There is a negative relationship between Kole and turf algae, and sand. An increase in those predictors will result in an significant decrease in Kole count

Question 3: dealing with overdispersion in a model

To deal with over-dispersion, we can use a negative binomial distribution or use random effects.

Turf Algae

#negative binomial distribution 
ta_nb <- glm.nb(count ~ ta, data = fish)
summary(ta_nb)
## 
## Call:
## glm.nb(formula = count ~ ta, data = fish, init.theta = 0.3750484251, 
##     link = log)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.64697    0.26206   13.92  < 2e-16 ***
## ta          -0.01900    0.00427   -4.45 8.57e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.375) family taken to be 1)
## 
##     Null deviance: 365.50  on 309  degrees of freedom
## Residual deviance: 346.15  on 308  degrees of freedom
## AIC: 2085.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3750 
##           Std. Err.:  0.0325 
## 
##  2 x log-likelihood:  -2079.4990
# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(ta_nb) %>% plot(log_y=TRUE)
ggeff

Anova(ta_nb)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##    LR Chisq Df Pr(>Chisq)    
## ta   19.342  1  1.093e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overdispersion: With a dispersion of 0.98, the overdispersion is fixed.

dispersion(ta_nb)
## [1] 0.9872545

Hard Corals

#negative binomial distribution 
hc_nb <- glm.nb(count ~ hard_coral, data = fish)
summary(hc_nb)
## 
## Call:
## glm.nb(formula = count ~ hard_coral, data = fish, init.theta = 0.3931303705, 
##     link = log)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.858032   0.140369  13.237  < 2e-16 ***
## hard_coral  0.031720   0.005176   6.129 8.86e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3931) family taken to be 1)
## 
##     Null deviance: 379.65  on 309  degrees of freedom
## Residual deviance: 346.22  on 308  degrees of freedom
## AIC: 2072.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3931 
##           Std. Err.:  0.0344 
## 
##  2 x log-likelihood:  -2066.6070
# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(hc_nb) %>% plot(log_y=TRUE)
ggeff

Anova(hc_nb)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##            LR Chisq Df Pr(>Chisq)    
## hard_coral   33.437  1  7.363e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overdispersion: With a dispersion of 1.08, the overdispersion is fixed.

dispersion(hc_nb)
## [1] 1.075455

Sand

#negative binomial distribution 
sand_nb <- glm.nb(count ~ sand, data = fish)
summary(sand_nb)
## 
## Call:
## glm.nb(formula = count ~ sand, data = fish, init.theta = 0.3729581071, 
##     link = log)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.941783   0.124216   23.68  < 2e-16 ***
## sand        -0.035008   0.007544   -4.64 3.48e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.373) family taken to be 1)
## 
##     Null deviance: 363.85  on 309  degrees of freedom
## Residual deviance: 345.96  on 308  degrees of freedom
## AIC: 2086.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3730 
##           Std. Err.:  0.0323 
## 
##  2 x log-likelihood:  -2080.8530
# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(sand_nb) %>% plot(log_y=TRUE)
ggeff

Anova(sand_nb)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##      LR Chisq Df Pr(>Chisq)    
## sand   17.888  1  2.343e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overdispersion: With a dispersion of 0.95, the overdispersion is fixed.

dispersion(sand_nb)
## [1] 0.9493442

CCA

#negative binomial distribution 
cca_nb <- glm.nb(count ~ cca, data = fish)
summary(cca_nb)
## 
## Call:
## glm.nb(formula = count ~ cca, data = fish, init.theta = 0.3839858353, 
##     link = log)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.999507   0.126584  15.796  < 2e-16 ***
## cca         0.060098   0.009659   6.222 4.91e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.384) family taken to be 1)
## 
##     Null deviance: 372.51  on 309  degrees of freedom
## Residual deviance: 345.99  on 308  degrees of freedom
## AIC: 2078.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3840 
##           Std. Err.:  0.0334 
## 
##  2 x log-likelihood:  -2072.8340
# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(cca_nb) %>% plot(log_y=TRUE)
ggeff

Anova(cca_nb)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##     LR Chisq Df Pr(>Chisq)    
## cca   26.529  1  2.597e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overdispersion: With a dispersion of 0.99, the overdispersion is fixed.

dispersion(ta_nb)
## [1] 0.9872545

Answer

The dispersion values became closer to one using the negative binomial. This is because in this model, the variance is proportional to the mean and data points become corrected by a factor of “Phi”.

Question 4: Including all substrate types in a single model

For this model, I tried a regular glm() with Poisson distribution and the dispersion was about 27. I decided to do negative binomial model to minimize the dispersion.

full_glm <- glm.nb(count ~ ta + hard_coral + sand + cca, data = fish)
summary(full_glm)
## 
## Call:
## glm.nb(formula = count ~ ta + hard_coral + sand + cca, data = fish, 
##     init.theta = 0.4547766048, link = log)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.85507    1.66061  -1.719  0.08556 .  
## ta           0.04659    0.01703   2.736  0.00622 ** 
## hard_coral   0.07439    0.01716   4.336 1.45e-05 ***
## sand         0.01835    0.01825   1.006  0.31453    
## cca          0.09260    0.02028   4.566 4.96e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.4548) family taken to be 1)
## 
##     Null deviance: 426.74  on 309  degrees of freedom
## Residual deviance: 345.03  on 305  degrees of freedom
## AIC: 2038.2
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.4548 
##           Std. Err.:  0.0410 
## 
##  2 x log-likelihood:  -2026.2040
# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(full_glm) %>% plot(log_y=TRUE)

ggeff
## $ta

## 
## $hard_coral

## 
## $sand

## 
## $cca

Anova(full_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##            LR Chisq Df Pr(>Chisq)    
## ta           6.2887  1  0.0121507 *  
## hard_coral  14.3924  1  0.0001484 ***
## sand         1.0950  1  0.2953666    
## cca         16.4564  1  4.978e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overdispersion: With a dispersion of 0.99, the overdispersion is fixed.

full_glm <- glm.nb(count ~ ta + hard_coral + sand + cca, data = fish)
dispersion(full_glm)
## [1] 1.512709

Diagnostic plots

The resid_panel function did not work with the glm.nb().

Answer

  • Turf Algae: The effects shifted from a negative slope to a positive slope. The Chi Square value is 1000x greater in the new model.
  • Hard Corals: The effect slope remains positive but the magnitude of the effect is much greater in the new model. The Chi Square value is 100,000x greater in the new model.
  • Sand: The effect slope shifted from a negative slope to a positive slope and the magnitude is greater in the new model. The Chi Square value is 10,000x greater in the new model.
  • CCA: The effect slope remains positive but the magnitude of the effect is much greater in the new model. The Chi Square value is 100x greater in the new model.
  • Explanation to changes: The model is now accounting for the all 4 benthic cover types and is able to consider the effects of each of the variable on one another instead of consider an individual effect.

Closing thoughts

Overall, it looks like corals and CCA are more associated with higher counts of Kole. We can possibly perform a linear regression and calculate the R-squared and p-values to estimate the significance of the correlation.